{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 08 Marginal and conditional distributions" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%%html\n", "" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import plotly.graph_objects as go\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import findspark\n", "\n", "findspark.init()\n", "from pyspark.context import SparkContext\n", "from pyspark.sql import functions as F\n", "from pyspark.sql.session import SparkSession\n", "\n", "spark = SparkSession.builder.appName(\"statistics\").master(\"local\").getOrCreate()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[khanacademy](https://www.khanacademy.org/math/ap-statistics/analyzing-categorical-ap/distributions-two-way-tables/v/marginal-distribution-and-conditional-distribution?modal=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "![Marginal and conditional distributions fig 1](./imgs/01-07-01.png)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "dataset = {\n", " \"#\": [\"80-100\", \"60-79\", \"40-59\", \"20-39\", \"0-19\"],\n", " \"0-20\": [0, 0, 2, 10, 2],\n", " \"21-40\": [4, 20, 4, 2, 0],\n", " \"41-60\": [16, 30, 32, 8, 0],\n", " \">60\": [20, 10, 32, 0, 8],\n", "}" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0-2021-4041-60>60
#
80-100041620
60-790203010
40-59243232
20-3910280
0-192008
Total14308670
\n", "
" ], "text/plain": [ " 0-20 21-40 41-60 >60\n", "# \n", "80-100 0 4 16 20\n", "60-79 0 20 30 10\n", "40-59 2 4 32 32\n", "20-39 10 2 8 0\n", "0-19 2 0 0 8\n", "Total 14 30 86 70" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(dataset).set_index(\"#\")\n", "df = df.append(df.sum().rename(\"Total\"))\n", "df" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+----+-----+-----+---+\n", "| #|0-20|21-40|41-60|>60|\n", "+------+----+-----+-----+---+\n", "|80-100| 0| 4| 16| 20|\n", "| 60-79| 0| 20| 30| 10|\n", "| 40-59| 2| 4| 32| 32|\n", "| 20-39| 10| 2| 8| 0|\n", "| 0-19| 2| 0| 0| 8|\n", "+------+----+-----+-----+---+\n", "\n" ] } ], "source": [ "sdf = spark.createDataFrame(zip(*dataset.values()), schema=list(dataset.keys()))\n", "sdf.registerTempTable(\"sdf_table\")\n", "sdf.show()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0-2021-4041-60>60
80-1000.04.016.020.0
60-790.020.030.010.0
40-592.04.032.032.0
20-3910.02.08.00.0
0-192.00.00.08.0
Total14.030.086.070.0
Percentage7.015.043.035.0
\n", "
" ], "text/plain": [ " 0-20 21-40 41-60 >60\n", "80-100 0.0 4.0 16.0 20.0\n", "60-79 0.0 20.0 30.0 10.0\n", "40-59 2.0 4.0 32.0 32.0\n", "20-39 10.0 2.0 8.0 0.0\n", "0-19 2.0 0.0 0.0 8.0\n", "Total 14.0 30.0 86.0 70.0\n", "Percentage 7.0 15.0 43.0 35.0" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df.append(\n", " pd.DataFrame(\n", " [df.iloc[-1] / df.iloc[-1].sum() * 100],\n", " index=[\"Percentage\"],\n", " columns=df.columns,\n", " )\n", ")\n", "df" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0-2021-4041-60>60TotalPercentage
80-1000.04.016.020.040.08.0
60-790.020.030.010.060.012.0
40-592.04.032.032.070.014.0
20-3910.02.08.00.020.04.0
0-192.00.00.08.010.02.0
Total14.030.086.070.0200.0NaN
Percentage7.015.043.035.0100.0NaN
\n", "
" ], "text/plain": [ " 0-20 21-40 41-60 >60 Total Percentage\n", "80-100 0.0 4.0 16.0 20.0 40.0 8.0\n", "60-79 0.0 20.0 30.0 10.0 60.0 12.0\n", "40-59 2.0 4.0 32.0 32.0 70.0 14.0\n", "20-39 10.0 2.0 8.0 0.0 20.0 4.0\n", "0-19 2.0 0.0 0.0 8.0 10.0 2.0\n", "Total 14.0 30.0 86.0 70.0 200.0 NaN\n", "Percentage 7.0 15.0 43.0 35.0 100.0 NaN" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"Total\"] = df.sum(axis=1)\n", "df[\"Percentage\"] = df[\"Total\"] / df[\"Total\"].sum() * 100\n", "df.loc[\"Total\"][\"Percentage\"] = np.nan\n", "df.loc[\"Percentage\"][\"Percentage\"] = np.nan\n", "df" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+----+-----+-----+---+-----+----------+\n", "| #|0-20|21-40|41-60|>60|Total|Percentage|\n", "+------+----+-----+-----+---+-----+----------+\n", "|80-100| 0| 4| 16| 20| 40| 20.0|\n", "| 60-79| 0| 20| 30| 10| 60| 30.0|\n", "| 40-59| 2| 4| 32| 32| 70| 35.0|\n", "| 20-39| 10| 2| 8| 0| 20| 10.0|\n", "| 0-19| 2| 0| 0| 8| 10| 5.0|\n", "+------+----+-----+-----+---+-----+----------+\n", "\n" ] } ], "source": [ "sdf = sdf.withColumn(\n", " \"Total\", F.col(\"0-20\") + F.col(\"21-40\") + F.col(\"41-60\") + F.col(\">60\")\n", ")\n", "sdf = sdf.withColumn(\n", " \"Percentage\", F.col(\"Total\") / sdf.select(F.sum(\"Total\")).collect()[0][0] * 100\n", ")\n", "sdf.show()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0-2021-4041-60>60TotalPercentage
Total14.030.086.070.0200.0NaN
Percentage7.015.043.035.0100.0NaN
\n", "
" ], "text/plain": [ " 0-20 21-40 41-60 >60 Total Percentage\n", "Total 14.0 30.0 86.0 70.0 200.0 NaN\n", "Percentage 7.0 15.0 43.0 35.0 100.0 NaN" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "marginal_dist_v_df = df.iloc[-2:]\n", "marginal_dist_v_df" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TotalPercentage
80-10040.08.0
60-7960.012.0
40-5970.014.0
20-3920.04.0
0-1910.02.0
Total200.0NaN
Percentage100.0NaN
\n", "
" ], "text/plain": [ " Total Percentage\n", "80-100 40.0 8.0\n", "60-79 60.0 12.0\n", "40-59 70.0 14.0\n", "20-39 20.0 4.0\n", "0-19 10.0 2.0\n", "Total 200.0 NaN\n", "Percentage 100.0 NaN" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "marginal_dist_h_df = df.loc[:, \"Total\":]\n", "marginal_dist_h_df" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-----+----------+\n", "| #|Total|Percentage|\n", "+------+-----+----------+\n", "|80-100| 40| 20.0|\n", "| 60-79| 60| 30.0|\n", "| 40-59| 70| 35.0|\n", "| 20-39| 20| 10.0|\n", "| 0-19| 10| 5.0|\n", "+------+-----+----------+\n", "\n" ] } ], "source": [ "sdf.select(\"#\", \"Total\", \"Percentage\").show()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
41-60Percentage
80-10016.018.604651
60-7930.034.883721
40-5932.037.209302
20-398.09.302326
0-190.00.000000
\n", "
" ], "text/plain": [ " 41-60 Percentage\n", "80-100 16.0 18.604651\n", "60-79 30.0 34.883721\n", "40-59 32.0 37.209302\n", "20-39 8.0 9.302326\n", "0-19 0.0 0.000000" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "conditional_dist_df = pd.DataFrame(df.loc[:, \"41-60\"])\n", "conditional_dist_df = conditional_dist_df.iloc[:-2]\n", "conditional_dist_df[\"Percentage\"] = (\n", " conditional_dist_df[\"41-60\"] / conditional_dist_df[\"41-60\"].sum() * 100\n", ")\n", "conditional_dist_df" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "+------+-----+------------------+\n", "| #|41-60| Percentage|\n", "+------+-----+------------------+\n", "|80-100| 16| 18.6046511627907|\n", "| 60-79| 30|34.883720930232556|\n", "| 40-59| 32| 37.2093023255814|\n", "| 20-39| 8| 9.30232558139535|\n", "| 0-19| 0| 0.0|\n", "+------+-----+------------------+\n", "\n" ] } ], "source": [ "sdf.select(\"#\", \"41-60\").withColumn(\n", " \"Percentage\", F.col(\"41-60\") / sdf.select(F.sum(\"41-60\")).collect()[0][0] * 100\n", ").show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }